library('tidyverse')
customers <- read.csv("C:/Users/ruth/Desktop/Olist/olist_customers_dataset.csv")
order.items <- read.csv("C:/Users/ruth/Desktop/Olist/olist_order_items_dataset.csv")
order.payments <- read.csv("C:/Users/ruth/Desktop/Olist/olist_order_payments_dataset.csv")
order.reviews <- read.csv("C:/Users/ruth/Desktop/Olist/olist_order_reviews_dataset.csv")
orders <- read.csv("C:/Users/ruth/Desktop/Olist/olist_orders_dataset.csv")
sellers <- read.csv("C:/Users/ruth/Desktop/Olist/olist_sellers_dataset.csv")
products <- read.csv("C:/Users/ruth/Desktop/Olist/olist_products_dataset.csv")
product.category <- read.csv("C:/Users/ruth/Desktop/Olist/product_category_name_translation.csv")

Changing the name of the product column

colnames(product.category)[1] <- "product_category_name"
head(product.category)

Joining the tables

prod <- products %>% inner_join(product.category)
Joining, by = "product_category_name"
olist <-customers %>%
        inner_join(orders) %>%
        inner_join(order.items) %>%
        inner_join(order.payments) %>%
        inner_join(order.reviews) %>%
        inner_join(sellers) %>%
        inner_join(prod) # %>%
Joining, by = "customer_id"
Joining, by = "order_id"
Joining, by = "order_id"
Joining, by = "order_id"
Joining, by = "seller_id"
Joining, by = "product_id"
        # inner_join(geolocation,by=c('customer_city'='geolocation_city'))
dim(olist)
[1] 116581     40

looking for missing values

colSums(is.na(olist))
                  customer_id            customer_unique_id      customer_zip_code_prefix                 customer_city 
                            0                             0                             0                             0 
               customer_state                      order_id                  order_status      order_purchase_timestamp 
                            0                             0                             0                             0 
            order_approved_at  order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date 
                            0                             0                             0                             0 
                order_item_id                    product_id                     seller_id           shipping_limit_date 
                            0                             0                             0                             0 
                        price                 freight_value            payment_sequential                  payment_type 
                            0                             0                             0                             0 
         payment_installments                 payment_value                     review_id                  review_score 
                            0                             0                             0                             0 
         review_comment_title        review_comment_message          review_creation_date       review_answer_timestamp 
                            0                             1                             0                             0 
       seller_zip_code_prefix                   seller_city                  seller_state         product_category_name 
                            0                             0                             0                             0 
          product_name_lenght    product_description_lenght            product_photos_qty              product_weight_g 
                            0                             0                             0                             1 
            product_length_cm             product_height_cm              product_width_cm product_category_name_english 
                            1                             1                             1                             0 

Plotting for missing values

library(VIM)
Loading required package: colorspace
Loading required package: grid
VIM is ready to use.

Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

Attaching package: 㤼㸱VIM㤼㸲

The following object is masked from 㤼㸱package:datasets㤼㸲:

    sleep
aggr_plot <- aggr(olist, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(olist), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))

 Variables sorted by number of missings: 

Dropping the missing values

olist[rownames(na.omit(olist)),]
# rownames(olist)

Checking for duplicates

anyDuplicated(olist)
[1] 0

Dropping irrelevant columns

olist.copy <- olist
colnames(olist)
 [1] "customer_id"                   "customer_unique_id"            "customer_zip_code_prefix"     
 [4] "customer_city"                 "customer_state"                "order_id"                     
 [7] "order_status"                  "order_purchase_timestamp"      "order_approved_at"            
[10] "order_delivered_carrier_date"  "order_delivered_customer_date" "order_estimated_delivery_date"
[13] "order_item_id"                 "product_id"                    "seller_id"                    
[16] "shipping_limit_date"           "price"                         "freight_value"                
[19] "payment_sequential"            "payment_type"                  "payment_installments"         
[22] "payment_value"                 "review_id"                     "review_score"                 
[25] "review_comment_title"          "review_comment_message"        "review_creation_date"         
[28] "review_answer_timestamp"       "seller_zip_code_prefix"        "seller_city"                  
[31] "seller_state"                  "product_category_name"         "product_name_lenght"          
[34] "product_description_lenght"    "product_photos_qty"            "product_weight_g"             
[37] "product_length_cm"             "product_height_cm"             "product_width_cm"             
[40] "product_category_name_english"
olist <- olist.copy[,-c(1,2,3,5,6,13,14,15,18,25,23,26,28,29,31,40)]

colnames(olist)
 [1] "customer_city"                 "order_status"                  "order_purchase_timestamp"     
 [4] "order_approved_at"             "order_delivered_carrier_date"  "order_delivered_customer_date"
 [7] "order_estimated_delivery_date" "shipping_limit_date"           "price"                        
[10] "payment_sequential"            "payment_type"                  "payment_installments"         
[13] "payment_value"                 "review_score"                  "review_creation_date"         
[16] "seller_city"                   "product_category_name"         "product_name_lenght"          
[19] "product_description_lenght"    "product_photos_qty"            "product_weight_g"             
[22] "product_length_cm"             "product_height_cm"             "product_width_cm"             

Save to a CSV file.

write.csv(olist,'olist2.csv')

Renaming the columns that are misspelled

colnames(olist)
 [1] "customer_city"                 "order_status"                  "order_purchase_timestamp"     
 [4] "order_approved_at"             "order_delivered_carrier_date"  "order_delivered_customer_date"
 [7] "order_estimated_delivery_date" "shipping_limit_date"           "price"                        
[10] "payment_sequential"            "payment_type"                  "payment_installments"         
[13] "payment_value"                 "review_score"                  "review_creation_date"         
[16] "seller_city"                   "product_category_name"         "product_name_length"          
[19] "product_description_length"    "product_photos_qty"            "product_weight_g"             
[22] "product_length_cm"             "product_height_cm"             "product_width_cm"             

Checking for outliers in the numeric columns

#get the numeric columns
num.col <- Filter(is.numeric, olist)
for (i in 1:length(num.col)) {
  boxplot(num.col[,i], main=names(num.col[i]), type="1")
  print(i)
}
[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] 10

[1] 11

[1] 12

UNIVARIATE ANALYSIS

#Getting the general overview of the data
summary(olist)
 customer_city      order_status       order_purchase_timestamp order_approved_at  order_delivered_carrier_date
 Length:116581      Length:116581      Length:116581            Length:116581      Length:116581               
 Class :character   Class :character   Class :character         Class :character   Class :character            
 Mode  :character   Mode  :character   Mode  :character         Mode  :character   Mode  :character            
                                                                                                               
                                                                                                               
                                                                                                               
                                                                                                               
 order_delivered_customer_date order_estimated_delivery_date shipping_limit_date     price         payment_sequential
 Length:116581                 Length:116581                 Length:116581       Min.   :   0.85   Min.   : 1.000    
 Class :character              Class :character              Class :character    1st Qu.:  39.90   1st Qu.: 1.000    
 Mode  :character              Mode  :character              Mode  :character    Median :  74.90   Median : 1.000    
                                                                                 Mean   : 120.75   Mean   : 1.093    
                                                                                 3rd Qu.: 134.90   3rd Qu.: 1.000    
                                                                                 Max.   :6735.00   Max.   :29.000    
                                                                                                                     
 payment_type       payment_installments payment_value      review_score   review_creation_date seller_city       
 Length:116581      Min.   : 0.00        Min.   :    0.0   Min.   :1.000   Length:116581        Length:116581     
 Class :character   1st Qu.: 1.00        1st Qu.:   61.0   1st Qu.:4.000   Class :character     Class :character  
 Mode  :character   Median : 2.00        Median :  108.2   Median :5.000   Mode  :character     Mode  :character  
                    Mean   : 2.95        Mean   :  172.9   Mean   :4.018                                          
                    3rd Qu.: 4.00        3rd Qu.:  189.7   3rd Qu.:5.000                                          
                    Max.   :24.00        Max.   :13664.1   Max.   :5.000                                          
                                                                                                                  
 product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm
 Length:116581         Min.   : 5.00       Min.   :   4.0             Min.   : 1.000     Min.   :    0    Min.   :  7.00   
 Class :character      1st Qu.:42.00       1st Qu.: 346.0             1st Qu.: 1.000     1st Qu.:  300    1st Qu.: 18.00   
 Mode  :character      Median :52.00       Median : 600.0             Median : 1.000     Median :  700    Median : 25.00   
                       Mean   :48.77       Mean   : 785.9             Mean   : 2.205     Mean   : 2116    Mean   : 30.32   
                       3rd Qu.:57.00       3rd Qu.: 983.0             3rd Qu.: 3.000     3rd Qu.: 1800    3rd Qu.: 38.00   
                       Max.   :76.00       Max.   :3992.0             Max.   :20.000     Max.   :40425    Max.   :105.00   
                                                                                         NA's   :1        NA's   :1        
 product_height_cm product_width_cm product_category_name_english
 Min.   :  2.00    Min.   :  6.00   Length:116581                
 1st Qu.:  8.00    1st Qu.: 15.00   Class :character             
 Median : 13.00    Median : 20.00   Mode  :character             
 Mean   : 16.65    Mean   : 23.12                                
 3rd Qu.: 20.00    3rd Qu.: 30.00                                
 Max.   :105.00    Max.   :118.00                                
 NA's   :1         NA's   :1                                     
desc_stats <- data.frame(
  min = apply(num.col, 2, min),
  median = apply(num.col, 2, median),
  mean_df = apply(num.col, 2, mean),
  SD = apply(num.col, 2, sd),
  max = apply(num.col, 2, max)
)
desc_stats <- round(desc_stats,1)
head(desc_stats)
library(dplyr)
library(inspectdf)
package 㤼㸱inspectdf㤼㸲 was built under R version 4.0.3
inspect_cat(olist)

 Column (2/12):  order_status
 Column (3/12):  order_purchase_timestamp
                                                                                
 Column (4/12):  order_approved_at
 Column (5/12):  order_delivered_carrier_date
 Column (6/12):  order_delivered_customer_date
 Column (7/12):  order_estimated_delivery_date
                                                                                
 Column (8/12):  shipping_limit_date
                                                                                
 Column (9/12):  payment_type
 Column (10/12):  review_creation_date
                                                                                
 Column (11/12):  seller_city
 Column (12/12):  product_category_name
                                                                                

common_pcnt, the percentage of each column occupied by the most common level shown in common.

Bivariate Analysis visualization

LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQphdXRob3I6IGV5YW4NCmRhdGU6IDI1LzAxLzIwMjENCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQpgYGB7cn0NCmxpYnJhcnkoJ3RpZHl2ZXJzZScpDQpgYGANCg0KDQpgYGB7cn0NCmN1c3RvbWVycyA8LSByZWFkLmNzdigiQzovVXNlcnMvcnV0aC9EZXNrdG9wL09saXN0L29saXN0X2N1c3RvbWVyc19kYXRhc2V0LmNzdiIpDQpvcmRlci5pdGVtcyA8LSByZWFkLmNzdigiQzovVXNlcnMvcnV0aC9EZXNrdG9wL09saXN0L29saXN0X29yZGVyX2l0ZW1zX2RhdGFzZXQuY3N2IikNCm9yZGVyLnBheW1lbnRzIDwtIHJlYWQuY3N2KCJDOi9Vc2Vycy9ydXRoL0Rlc2t0b3AvT2xpc3Qvb2xpc3Rfb3JkZXJfcGF5bWVudHNfZGF0YXNldC5jc3YiKQ0Kb3JkZXIucmV2aWV3cyA8LSByZWFkLmNzdigiQzovVXNlcnMvcnV0aC9EZXNrdG9wL09saXN0L29saXN0X29yZGVyX3Jldmlld3NfZGF0YXNldC5jc3YiKQ0Kb3JkZXJzIDwtIHJlYWQuY3N2KCJDOi9Vc2Vycy9ydXRoL0Rlc2t0b3AvT2xpc3Qvb2xpc3Rfb3JkZXJzX2RhdGFzZXQuY3N2IikNCnNlbGxlcnMgPC0gcmVhZC5jc3YoIkM6L1VzZXJzL3J1dGgvRGVza3RvcC9PbGlzdC9vbGlzdF9zZWxsZXJzX2RhdGFzZXQuY3N2IikNCnByb2R1Y3RzIDwtIHJlYWQuY3N2KCJDOi9Vc2Vycy9ydXRoL0Rlc2t0b3AvT2xpc3Qvb2xpc3RfcHJvZHVjdHNfZGF0YXNldC5jc3YiKQ0KcHJvZHVjdC5jYXRlZ29yeSA8LSByZWFkLmNzdigiQzovVXNlcnMvcnV0aC9EZXNrdG9wL09saXN0L3Byb2R1Y3RfY2F0ZWdvcnlfbmFtZV90cmFuc2xhdGlvbi5jc3YiKQ0KYGBgDQoNCkNoYW5naW5nIHRoZSBuYW1lIG9mIHRoZSBwcm9kdWN0IGNvbHVtbg0KDQpgYGB7cn0NCmNvbG5hbWVzKHByb2R1Y3QuY2F0ZWdvcnkpWzFdIDwtICJwcm9kdWN0X2NhdGVnb3J5X25hbWUiDQpoZWFkKHByb2R1Y3QuY2F0ZWdvcnkpDQpgYGANCkpvaW5pbmcgdGhlIHRhYmxlcw0KDQpgYGB7cn0NCnByb2QgPC0gcHJvZHVjdHMgJT4lIGlubmVyX2pvaW4ocHJvZHVjdC5jYXRlZ29yeSkNCg0Kb2xpc3QgPC1jdXN0b21lcnMgJT4lDQogICAgICAgIGlubmVyX2pvaW4ob3JkZXJzKSAlPiUNCiAgICAgICAgaW5uZXJfam9pbihvcmRlci5pdGVtcykgJT4lDQogICAgICAgIGlubmVyX2pvaW4ob3JkZXIucGF5bWVudHMpICU+JQ0KICAgICAgICBpbm5lcl9qb2luKG9yZGVyLnJldmlld3MpICU+JQ0KICAgICAgICBpbm5lcl9qb2luKHNlbGxlcnMpICU+JQ0KICAgICAgICBpbm5lcl9qb2luKHByb2QpICMgJT4lDQogICAgICAgICMgaW5uZXJfam9pbihnZW9sb2NhdGlvbixieT1jKCdjdXN0b21lcl9jaXR5Jz0nZ2VvbG9jYXRpb25fY2l0eScpKQ0KZGltKG9saXN0KQ0KYGBgDQoNCmxvb2tpbmcgZm9yIG1pc3NpbmcgdmFsdWVzDQoNCmBgYHtyfQ0KY29sU3Vtcyhpcy5uYShvbGlzdCkpDQpgYGANClBsb3R0aW5nIGZvciBtaXNzaW5nIHZhbHVlcw0KYGBge3J9DQpsaWJyYXJ5KFZJTSkNCmFnZ3JfcGxvdCA8LSBhZ2dyKG9saXN0LCBjb2w9YygnbmF2eWJsdWUnLCdyZWQnKSwgbnVtYmVycz1UUlVFLCBzb3J0VmFycz1UUlVFLCBsYWJlbHM9bmFtZXMob2xpc3QpLCBjZXguYXhpcz0uNywgZ2FwPTMsIHlsYWI9YygiSGlzdG9ncmFtIG9mIG1pc3NpbmcgZGF0YSIsIlBhdHRlcm4iKSkNCmBgYA0KDQpEcm9wcGluZyB0aGUgbWlzc2luZyB2YWx1ZXMNCg0KYGBge3J9DQpvbGlzdFtyb3duYW1lcyhuYS5vbWl0KG9saXN0KSksXQ0KIyByb3duYW1lcyhvbGlzdCkNCmBgYA0KDQpDaGVja2luZyBmb3IgZHVwbGljYXRlcw0KDQpgYGB7cn0NCmFueUR1cGxpY2F0ZWQob2xpc3QpDQpgYGANCkRyb3BwaW5nIGlycmVsZXZhbnQgY29sdW1ucw0KYGBge3J9DQpvbGlzdC5jb3B5IDwtIG9saXN0DQpjb2xuYW1lcyhvbGlzdCkNCmBgYA0KYGBge3J9DQpvbGlzdCA8LSBvbGlzdC5jb3B5WywtYygxLDIsMyw1LDYsMTMsMTQsMTUsMTgsMjUsMjMsMjYsMjgsMjksMzEsNDApXQ0KDQpjb2xuYW1lcyhvbGlzdCkNCmBgYA0KDQpTYXZlIHRvIGEgQ1NWIGZpbGUuDQoNCmBgYHtyfQ0Kd3JpdGUuY3N2KG9saXN0LCdvbGlzdDIuY3N2JykNCmBgYA0KDQpSZW5hbWluZyB0aGUgY29sdW1ucyB0aGF0IGFyZSBtaXNzcGVsbGVkDQoNCmBgYHtyfQ0KY29sbmFtZXMob2xpc3QpWzE5XSA8LSAicHJvZHVjdF9kZXNjcmlwdGlvbl9sZW5ndGgiDQpjb2xuYW1lcyhvbGlzdClbMThdIDwtICJwcm9kdWN0X25hbWVfbGVuZ3RoIg0KY29sbmFtZXMob2xpc3QpDQpgYGANCg0KQ2hlY2tpbmcgZm9yIG91dGxpZXJzIGluIHRoZSBudW1lcmljIGNvbHVtbnMNCg0KYGBge3J9DQojZ2V0IHRoZSBudW1lcmljIGNvbHVtbnMNCm51bS5jb2wgPC0gRmlsdGVyKGlzLm51bWVyaWMsIG9saXN0KQ0KZm9yIChpIGluIDE6bGVuZ3RoKG51bS5jb2wpKSB7DQogIGJveHBsb3QobnVtLmNvbFssaV0sIG1haW49bmFtZXMobnVtLmNvbFtpXSksIHR5cGU9IjEiKQ0KICBwcmludChpKQ0KfQ0KDQpgYGANCg0KVU5JVkFSSUFURSBBTkFMWVNJUw0KDQpgYGB7cn0NCiNHZXR0aW5nIHRoZSBnZW5lcmFsIG92ZXJ2aWV3IG9mIHRoZSBkYXRhDQpzdW1tYXJ5KG9saXN0KQ0KYGBgDQoNCg0KYGBge3J9DQpkZXNjX3N0YXRzIDwtIGRhdGEuZnJhbWUoDQogIG1pbiA9IGFwcGx5KG51bS5jb2wsIDIsIG1pbiksDQogIG1lZGlhbiA9IGFwcGx5KG51bS5jb2wsIDIsIG1lZGlhbiksDQogIG1lYW5fZGYgPSBhcHBseShudW0uY29sLCAyLCBtZWFuKSwNCiAgU0QgPSBhcHBseShudW0uY29sLCAyLCBzZCksDQogIG1heCA9IGFwcGx5KG51bS5jb2wsIDIsIG1heCkNCikNCmRlc2Nfc3RhdHMgPC0gcm91bmQoZGVzY19zdGF0cywxKQ0KaGVhZChkZXNjX3N0YXRzKQ0KYGBgDQoNCg0KYGBge3J9DQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShpbnNwZWN0ZGYpDQoNCmluc3BlY3RfY2F0KG9saXN0KQ0KYGBgDQoqKmNvbW1vbl9wY250KiosIHRoZSBwZXJjZW50YWdlIG9mIGVhY2ggY29sdW1uIG9jY3VwaWVkIGJ5IHRoZSBtb3N0IGNvbW1vbiBsZXZlbCBzaG93biBpbiAqKmNvbW1vbi4qKg0KDQojIyBCaXZhcmlhdGUgQW5hbHlzaXMgdmlzdWFsaXphdGlvbg0KDQoNCg==